################################################################################
# Evaluation of Finetuning Hate Speech Classifier with different 
# Annotation Datasets. (We use gpt-4o-mini-2024-07-18)
################################################################################
################################################################################
# Libraries
################################################################################
library(dplyr)
library(tidyr)
library(readr)
library(pbmcapply)
library(stringr)
library(tidymodels)
library(caret)
library(lubridate)
library(ggplot2)
library(ggthemes)
library(scales)
library(irr)
library(tidycomm)
library(forestmangr)
library(ggcorrplot)
library(RColorBrewer)
library(cvms)
library(cowplot)
library(binom)
library(boot)
library(xtable)
################################################################################
# Setup
################################################################################
# Start from an empty global environment so results do not depend on objects
# left over from an earlier session (this wipes everything defined so far).
rm(list = ls())

# Make the folder that contains this script the working directory; all
# relative "../" paths below are resolved against it.
# NOTE(review): the path comes from rstudioapi, so this presumably only works
# when the script is run inside RStudio -- confirm before using Rscript/source.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
parent_path <- getwd()
getwd()

# Fixed seed so the random sampling further down is reproducible.
set.seed(123456789)
################################################################################
# Load Data 
################################################################################
# Prediction files of the fine-tuned models, one file per annotation source
# (datasets 1-6: experts, NGO, Appen, citizen science, Prolific, research
# assistants) and per training-set size (trn_100 / trn_250).
# Fine-tuned predictions with 100 rows
#---------------------------------------------------------------
expert_df_100 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_1_task_1/ds_1_t_1_trn_100-finetune_chetGPT_hatespeech_experts.csv")
ngo_df_100 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_2_task_1/ds_2_t_1_trn_100-finetune_chetGPT_hatespeech_ngo.csv")
appen_df_100 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_3_task_1/ds_3_t_1_trn_100-finetune_chetGPT_hatespeech_appen.csv")
citi_df_100 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_4_task_1/ds_4_t_1_trn_100-finetune_chetGPT_hatespeech_citi.csv")
prolific_df_100 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_5_task_1/ds_5_t_1_trn_100-finetune_chetGPT_hatespeech_prolific.csv")
ras_df_100 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_6_task_1/ds_6_t_1_trn_100-finetune_chetGPT_hatespeech_research_assistants.csv")
# Fine-tuned predictions with 250 rows
#---------------------------------------------------------------
expert_df_250 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_1_task_1/ds_1_t_1_trn_250-finetune_chetGPT_hatespeech_experts.csv")
ngo_df_250 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_2_task_1/ds_2_t_1_trn_250-finetune_chetGPT_hatespeech_ngo.csv")
appen_df_250 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_3_task_1/ds_3_t_1_trn_250-finetune_chetGPT_hatespeech_appen.csv")
citi_df_250 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_4_task_1/ds_4_t_1_trn_250-finetune_chetGPT_hatespeech_citi.csv")
prolific_df_250 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_5_task_1/ds_5_t_1_trn_250-finetune_chetGPT_hatespeech_prolific.csv")
ras_df_250 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_6_task_1/ds_6_t_1_trn_250-finetune_chetGPT_hatespeech_research_assistants.csv")

# Dataset 7 ("gpt_zero" in the file name), again for both training-set sizes.
gpt_df_100 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_7_task_1/ds_7_t_1_trn_100-finetune_chetGPT_hatespeech_gpt_zero.csv")
gpt_df_250 <- read_csv("../annotations/chatGPT/output_fine_tuning/predictions_2/dataset_7_task_1/ds_7_t_1_trn_250-finetune_chetGPT_hatespeech_gpt_zero.csv")


# Baseline Annotations
#---------------------------------------------------------------
# Raw annotation sets per annotator group. The expert set is read twice:
# the master file and its "_long" companion.
experts <- read_csv("../annotations/experts/experts_for_annots_16062023_master.csv") 
experts_long <- read_csv("../annotations/experts/experts_for_annots_16062023_master_long.csv")
appen <- read_csv("../annotations/Appen/appen_set_29062022_for_analysis.csv")
ngo <- read_csv("../annotations/NGO/ngo_set_23062022_for_analysis.csv")
prolific <- read_csv("../annotations/Prolific/prolific_set_06042023_for_analysis.csv")
ras <- read_csv("../annotations/RAs/ra_set_29062022_for_analysis.csv")
citi <- read_csv("../annotations/Citizen Science/288_stop_hate_speech_task_run_csv/stop_hate_speech_task_run.csv")
# Zero-shot chatGPT annotations under three prompt definitions
# (sensitivity check, UN definition, main definition).
chatgpt_sensitivity_def_version<- read_csv("../annotations/chatGPT/chatgpt_set_zeroshot_sensitivity_check_for_analysis.csv")
chatgpt_un_def_version<- read_csv("../annotations/chatGPT/chatgpt_set_zeroshot_un_def_for_analysis.csv") 
chatgpt_main_def_version <- read_csv("../annotations/chatGPT/chatgpt_set_zeroshot_main_def_for_analysis.csv") 
################################################################################
# ICR (inter-coder reliability) per annotator group, incl. the 3-way label
################################################################################
# For every annotator group: add the unit id used by test_icr() and a combined
# three-class label, then compute the reliability statistics for the two
# binary variables and the combined label.
# Label rule (same for every group): hate speech and toxic -> "Toxic",
# hate speech but not toxic -> "Hate Speech", anything else -> "Neither".
# The groups are processed in the order in which they appear in icr_table.

# Experts (the toxicity column of the long expert table is toxic_2)
experts_long <- experts_long %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic_2 == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic_2 == 0, "Hate Speech", "Neither")
    )
  )
icr_experts <- test_icr(experts_long, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic_2, is_hs_tox)

# Research assistants
ras <- ras %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic == 0, "Hate Speech", "Neither")
    )
  )
icr_reass <- test_icr(ras, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic, is_hs_tox)

# NGO
ngo <- ngo %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic == 0, "Hate Speech", "Neither")
    )
  )
icr_ngo <- test_icr(ngo, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic, is_hs_tox)

# Prolific
prolific <- prolific %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic == 0, "Hate Speech", "Neither")
    )
  )
icr_proli <- test_icr(prolific, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic, is_hs_tox)

# Appen
appen <- appen %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic == 0, "Hate Speech", "Neither")
    )
  )
icr_appen <- test_icr(appen, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic, is_hs_tox)

# Citizen science
citi <- citi %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic == 0, "Hate Speech", "Neither")
    )
  )
icr_citi <- test_icr(citi, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic, is_hs_tox)

# chatGPT, United Nations definition
chatgpt_un_def_version <- chatgpt_un_def_version %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic == 0, "Hate Speech", "Neither")
    )
  )
icr_chat_n <- test_icr(chatgpt_un_def_version, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic, is_hs_tox)

# chatGPT, sensitivity check
chatgpt_sensitivity_def_version <- chatgpt_sensitivity_def_version %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic == 0, "Hate Speech", "Neither")
    )
  )
icr_chat_s <- test_icr(chatgpt_sensitivity_def_version, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic, is_hs_tox)

# chatGPT, main definition
chatgpt_main_def_version <- chatgpt_main_def_version %>%
  mutate(
    unit_var = paste0(newuniqueid),
    is_hs_tox = ifelse(
      ishatespeech == 1 & toxic == 1, "Toxic",
      ifelse(ishatespeech == 1 & toxic == 0, "Hate Speech", "Neither")
    )
  )
icr_chat_m <- test_icr(chatgpt_main_def_version, unit_var = unit_var, coder_var = annotator, ishatespeech, toxic, is_hs_tox)

# One table for all groups: three rows per group (one per tested variable),
# labelled with the group name in the same order as the rows were bound.
icr_table <- round_df(
  dplyr::bind_rows(
    icr_experts, icr_reass, icr_ngo, icr_proli, icr_appen,
    icr_citi, icr_chat_n, icr_chat_s, icr_chat_m
  ),
  digits = 3
)
icr_table$Platform <- rep(
  c(
    "Experts", "Research Assistants", "NGO", "Prolific", "Appen",
    "Citizen Science", "Chat GPT (United Nations Definition)",
    "Chat GPT (Sensitivity Check)", "Chat GPT"
  ),
  each = 3
)
################################################################################
# Merge Annotation Sets and make sure Hate Speech is now a three way thing too!
################################################################################
# Make all sets to wide format...
# Change Name of isHateSpeech values for all annotators to something of use...
# Appen: spread the long annotations to one column set per annotator, give the
# annotator columns generic *_1/_2/_3 names and derive the group-level labels.
appen_wide <- appen %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(ishatespeech, target_group, toxic, is_hs_tox)
  ) %>%
  # Lookup is new_name = "old_name"; it expands to
  # ishatespeech_1 = ishatespeech_appen_1, ..., istoxic_1 = toxic_appen_1, ...,
  # is_hs_tox_3 = is_hs_tox_appen_3.
  rename(all_of(setNames(
    paste0(rep(c("ishatespeech", "target_group", "toxic", "is_hs_tox"), each = 3), "_appen_", 1:3),
    paste0(rep(c("ishatespeech", "target_group", "istoxic", "is_hs_tox"), each = 3), "_", 1:3)
  ))) %>%
  mutate(
    # 1 when the three annotator codes sum to at least 2, else 0
    ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
    istoxic_combined = ifelse(istoxic_1 + istoxic_2 + istoxic_3 >= 2, 1, 0),
    # target group shared by any two annotators, "" when all three differ
    target_combined = ifelse(
      target_group_1 == target_group_2, target_group_1,
      ifelse(
        target_group_2 == target_group_3, target_group_3,
        ifelse(target_group_1 == target_group_3, target_group_1, "")
      )
    )
  )


# NGO: spread the long annotations to one column set per annotator, give the
# annotator columns generic *_1/_2/_3 names and derive the group-level labels.
ngo_wide <- ngo %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(ishatespeech, target_group, toxic, is_hs_tox)
  ) %>%
  # Lookup is new_name = "old_name"; it expands to
  # ishatespeech_1 = ishatespeech_ngo_1, ..., istoxic_1 = toxic_ngo_1, ...,
  # is_hs_tox_3 = is_hs_tox_ngo_3.
  rename(all_of(setNames(
    paste0(rep(c("ishatespeech", "target_group", "toxic", "is_hs_tox"), each = 3), "_ngo_", 1:3),
    paste0(rep(c("ishatespeech", "target_group", "istoxic", "is_hs_tox"), each = 3), "_", 1:3)
  ))) %>%
  mutate(
    # 1 when the three annotator codes sum to at least 2, else 0
    ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
    istoxic_combined = ifelse(istoxic_1 + istoxic_2 + istoxic_3 >= 2, 1, 0),
    # target group shared by any two annotators, "" when all three differ
    target_combined = ifelse(
      target_group_1 == target_group_2, target_group_1,
      ifelse(
        target_group_2 == target_group_3, target_group_3,
        ifelse(target_group_1 == target_group_3, target_group_1, "")
      )
    )
  )

# Prolific: same reshaping as for the other groups, but the combined target is
# resolved first and a combined target of "other" forces the toxic label to 0.
prolific_wide <- prolific %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(ishatespeech, target_group, toxic, is_hs_tox)
  ) %>%
  # Lookup is new_name = "old_name"; it expands to
  # ishatespeech_1 = ishatespeech_prolific1, ..., istoxic_1 = toxic_prolific1,
  # ..., is_hs_tox_3 = is_hs_tox_prolific3 (no underscore before the number).
  rename(all_of(setNames(
    paste0(rep(c("ishatespeech", "target_group", "toxic", "is_hs_tox"), each = 3), "_prolific", 1:3),
    paste0(rep(c("ishatespeech", "target_group", "istoxic", "is_hs_tox"), each = 3), "_", 1:3)
  ))) %>%
  mutate(
    # 1 when the three annotator codes sum to at least 2, else 0
    ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
    # target group shared by any two annotators, "" when all three differ
    target_combined = case_when(
      target_group_1 == "other" & target_group_2 == "other" & target_group_3 == "other" ~ "other",
      target_group_1 == target_group_2 ~ target_group_1,
      target_group_2 == target_group_3 ~ target_group_2,
      target_group_1 == target_group_3 ~ target_group_1,
      TRUE ~ ""
    ),
    # uses target_combined from the line above, so it has to come after it
    istoxic_combined = case_when(
      target_combined == "other" ~ 0,
      istoxic_1 + istoxic_2 + istoxic_3 >= 2 ~ 1,
      TRUE ~ 0
    )
  )

# Research assistants: spread the long annotations to one column set per
# annotator, give the annotator columns generic *_1/_2/_3 names and derive the
# group-level labels.
ras_wide <- ras %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(ishatespeech, target_group, toxic, is_hs_tox)
  ) %>%
  # Lookup is new_name = "old_name"; it expands to
  # ishatespeech_1 = ishatespeech_ra_1, ..., istoxic_1 = toxic_ra_1, ...,
  # is_hs_tox_3 = is_hs_tox_ra_3.
  rename(all_of(setNames(
    paste0(rep(c("ishatespeech", "target_group", "toxic", "is_hs_tox"), each = 3), "_ra_", 1:3),
    paste0(rep(c("ishatespeech", "target_group", "istoxic", "is_hs_tox"), each = 3), "_", 1:3)
  ))) %>%
  mutate(
    # 1 when the three annotator codes sum to at least 2, else 0
    ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
    istoxic_combined = ifelse(istoxic_1 + istoxic_2 + istoxic_3 >= 2, 1, 0),
    # target group shared by any two annotators, "" when all three differ
    target_combined = ifelse(
      target_group_1 == target_group_2, target_group_1,
      ifelse(
        target_group_2 == target_group_3, target_group_3,
        ifelse(target_group_1 == target_group_3, target_group_1, "")
      )
    )
  )

# Citizen science: spread the long annotations to one column set per annotator
# and derive the group-level labels. The pivoted columns already end in
# _1/_2/_3, so only toxic_* actually changes its name (to istoxic_*); the
# remaining lookup entries map a column onto itself.
citi_wide <- citi %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(ishatespeech, target_group, toxic, is_hs_tox)
  ) %>%
  # Lookup is new_name = "old_name"
  rename(all_of(setNames(
    paste0(rep(c("ishatespeech", "target_group", "toxic", "is_hs_tox"), each = 3), "_", 1:3),
    paste0(rep(c("ishatespeech", "target_group", "istoxic", "is_hs_tox"), each = 3), "_", 1:3)
  ))) %>%
  mutate(
    # 1 when the three annotator codes sum to at least 2, else 0
    ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
    istoxic_combined = ifelse(istoxic_1 + istoxic_2 + istoxic_3 >= 2, 1, 0),
    # target group shared by any two annotators, "" when all three differ
    target_combined = ifelse(
      target_group_1 == target_group_2, target_group_1,
      ifelse(
        target_group_2 == target_group_3, target_group_3,
        ifelse(target_group_1 == target_group_3, target_group_1, "")
      )
    )
  )


# chatGPT (United Nations definition): spread the long annotations to one
# column set per run and derive the group-level labels. The pivoted columns
# already end in _1/_2/_3, so only toxic_* actually changes its name (to
# istoxic_*); the remaining lookup entries map a column onto itself.
chat_wide_n <- chatgpt_un_def_version %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(ishatespeech, target_group, toxic, is_hs_tox)
  ) %>%
  # Lookup is new_name = "old_name"
  rename(all_of(setNames(
    paste0(rep(c("ishatespeech", "target_group", "toxic", "is_hs_tox"), each = 3), "_", 1:3),
    paste0(rep(c("ishatespeech", "target_group", "istoxic", "is_hs_tox"), each = 3), "_", 1:3)
  ))) %>%
  mutate(
    # 1 when the three codes sum to at least 2, else 0
    ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
    istoxic_combined = ifelse(istoxic_1 + istoxic_2 + istoxic_3 >= 2, 1, 0),
    # target group shared by any two runs, "" when all three differ
    target_combined = ifelse(
      target_group_1 == target_group_2, target_group_1,
      ifelse(
        target_group_2 == target_group_3, target_group_3,
        ifelse(target_group_1 == target_group_3, target_group_1, "")
      )
    )
  )

# chatGPT (sensitivity check): spread the long annotations to one column set
# per run and derive the group-level labels. The pivoted columns already end
# in _1/_2/_3, so only toxic_* actually changes its name (to istoxic_*); the
# remaining lookup entries map a column onto itself.
chat_wide_s <- chatgpt_sensitivity_def_version %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(ishatespeech, target_group, toxic, is_hs_tox)
  ) %>%
  # Lookup is new_name = "old_name"
  rename(all_of(setNames(
    paste0(rep(c("ishatespeech", "target_group", "toxic", "is_hs_tox"), each = 3), "_", 1:3),
    paste0(rep(c("ishatespeech", "target_group", "istoxic", "is_hs_tox"), each = 3), "_", 1:3)
  ))) %>%
  mutate(
    # 1 when the three codes sum to at least 2, else 0
    ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
    istoxic_combined = ifelse(istoxic_1 + istoxic_2 + istoxic_3 >= 2, 1, 0),
    # target group shared by any two runs, "" when all three differ
    target_combined = ifelse(
      target_group_1 == target_group_2, target_group_1,
      ifelse(
        target_group_2 == target_group_3, target_group_3,
        ifelse(target_group_1 == target_group_3, target_group_1, "")
      )
    )
  )


# chatGPT (main definition): spread the long annotations to one column set per
# run and derive the group-level labels. The pivoted columns already end in
# _1/_2/_3, so only toxic_* actually changes its name (to istoxic_*); the
# remaining lookup entries map a column onto itself.
chat_wide_m <- chatgpt_main_def_version %>%
  group_by(newuniqueid) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(ishatespeech, target_group, toxic, is_hs_tox)
  ) %>%
  # Lookup is new_name = "old_name"
  rename(all_of(setNames(
    paste0(rep(c("ishatespeech", "target_group", "toxic", "is_hs_tox"), each = 3), "_", 1:3),
    paste0(rep(c("ishatespeech", "target_group", "istoxic", "is_hs_tox"), each = 3), "_", 1:3)
  ))) %>%
  mutate(
    # 1 when the three codes sum to at least 2, else 0
    ishate_combined = ifelse(ishatespeech_1 + ishatespeech_2 + ishatespeech_3 >= 2, 1, 0),
    istoxic_combined = ifelse(istoxic_1 + istoxic_2 + istoxic_3 >= 2, 1, 0),
    # target group shared by any two runs, "" when all three differ
    target_combined = ifelse(
      target_group_1 == target_group_2, target_group_1,
      ifelse(
        target_group_2 == target_group_3, target_group_3,
        ifelse(target_group_1 == target_group_3, target_group_1, "")
      )
    )
  )





# Experts: the master file is already one row per comment. Build the unit id
# from article id + comment id and keep the three individual codings, the
# consensus columns, the difficult-case flag and the toxic column.
experts_wide <- experts %>%
  mutate(unit_var = paste0(ArticleID, ID)) %>%
  select(c(
    ArticleID, ID, Kommentar, Titel, Text, group, unit_var,
    `Hate Speech_KD`, `Hate Speech_FG`, `Hate Speech_SK`,
    `Target Group_KD`, `Target Group_FG`, `Target Group_SK`,
    `Konsensus HS`, `Konsensus Target 1`, `Difficult case?`, `toxic`
  ))

# Rename positionally (same order as the select() above) to the column names
# used by the other groups' wide tables. The former tolower() pass was dropped:
# its result was overwritten by this assignment straight away.
colnames(experts_wide) <- c(
  "articleid", "id", "kommentar", "titel", "text", "group", "unit_var",
  "ishatespeech_1", "ishatespeech_2", "ishatespeech_3",
  "target_group_1", "target_group_2", "target_group_3",
  "ishate_combined", "target_combined", "difficult_case", "istoxic_combined"
)

# Map the German free-text consensus targets onto the category codes.
# Labels without a mapping are kept unchanged; "0" becomes missing.
# NA_character_ (instead of a bare logical NA) keeps every case_when() branch
# of type character, which dplyr releases before 1.1.0 require.
experts_wide <- experts_wide %>% mutate(target_combined = case_when(
  target_combined %in% c("Religion", "Herkunft / Religion") ~ "religion",
  target_combined %in% c("Nationalität/Hautfarbe/Herkunft", "Herkunft", "Nationalität", "Herkunft/Hautfarbe", "Herkunft, Religion","Nationalität/Herkunft") ~ "nationality",
  target_combined %in% c("Geschlecht", "Gechlecht", "Geschlecht/Herkunft", "Geschlecht, Herkunft", "Geschlecht, pol. Einstellung, Herkunft", "Körperliche Merkmale /Aussehen", "Aussehen, Geschlecht", "Aussehen") ~ "sex",
  target_combined %in% c("Politische Einstellung", "Pol. Einstellung") ~ "politics",
  target_combined == "Sexualität" ~ "sexuality",
  target_combined %in% c("Sozialer Status/Bildung/Einkommen/Berufsgruppe", "Beruf") ~ "social_status",
  target_combined == "Andere" ~ "other",
  target_combined == "Alter" ~ "age",
  target_combined == 0 ~ NA_character_,
  TRUE ~ target_combined
))


# Merge sets
# Stack the wide tables of all groups (experts first) and drop the grouping
# that the pivot steps left on the individual tables.
df_1 <- dplyr::bind_rows(
  experts_wide, ras_wide, ngo_wide, appen_wide, prolific_wide,
  citi_wide, chat_wide_n, chat_wide_s, chat_wide_m
) %>%
  ungroup()
################################################################################
# Next ICR
################################################################################
# Three-class label built from the group-level (majority) labels.
df_1 <- mutate(
  df_1,
  ishstox_combined = ifelse(
    ishate_combined == 1 & istoxic_combined == 0, "Hate Speech",
    ifelse(ishate_combined == 1 & istoxic_combined == 1, "Toxic Speech", "Neither")
  )
)

# Reliability across groups: every group acts as one coder (coder_var = group).
icr_combined_h <- mutate(
  test_icr(df_1, unit_var = unit_var, coder_var = group, ishate_combined),
  Platform = "Combined ICR (Hate Speech)"
)
icr_combined_t <- mutate(
  test_icr(df_1, unit_var = unit_var, coder_var = group, istoxic_combined),
  Platform = "Combined ICR (Toxic Speech)"
)
icr_combined_3 <- mutate(
  test_icr(df_1, unit_var = unit_var, coder_var = group, ishstox_combined),
  Platform = "Combined ICR (3 Classes)"
)

# Append the three across-group rows to the per-group table.
icr_table <- round_df(
  dplyr::bind_rows(icr_table, icr_combined_h, icr_combined_t, icr_combined_3),
  digits = 3
)

# Agreement seen from the point of view of the gold group. The subsets below
# are built from the consensus labels in experts_wide.
# Binary splits on the hate-speech label ...
is_hate_filter <- filter(experts_wide, ishate_combined == 1)
no_hate_filter <- filter(experts_wide, ishate_combined == 0)

# ... and on the toxic label.
is_toxic_filter <- filter(experts_wide, istoxic_combined == 1)
no_toxic_filter <- filter(experts_wide, istoxic_combined == 0)

# Three-class split: hate speech only / hate speech and toxic / neither.
hs_filter <- filter(experts_wide, ishate_combined == 1 & istoxic_combined == 0)
tx_filter <- filter(experts_wide, ishate_combined == 1 & istoxic_combined == 1)
no_filter <- filter(experts_wide, ishate_combined == 0 & istoxic_combined == 0)

# Number of units covered by the three-class split.
nrow(hs_filter) + nrow(tx_filter) + nrow(no_filter)

# Unit ids per subset. four_way: 1 = hate, 2 = no hate, 3 = toxic,
# 4 = not toxic. thre_way: 1 = hate speech only, 2 = toxic, 3 = neither.
four_way <- list(
  is_hate_filter$unit_var,
  no_hate_filter$unit_var,
  is_toxic_filter$unit_var,
  no_toxic_filter$unit_var
)
thre_way <- list(
  hs_filter$unit_var,
  tx_filter$unit_var,
  no_filter$unit_var
)

# Column labels for the subset statistics (element 1 = positive subset,
# element 2 = negative subset).
four_way_name_unit <- c("n_Units Positive", "n_Units Negative")
four_way_name <- c("Agreement Positive", "Agreement Negative")
four_way_holist <- c("Holstis_CR Positive", "Holstis_CR Negative")
four_way_krip <- c("Krippendorffs_Alpha Positive", "Krippendorffs_Alpha Negative")

# Reliability inside the subsets defined by the expert consensus labels.
# Pass i = 1 fills the "Positive" columns, pass i = 2 the "Negative" columns:
#   four_way[[i]]     -> units used for the hate-speech variable
#   four_way[[i + 2]] -> units used for the toxic variable
#   thre_way[[i]]     -> units used for the three-class variable
# The *_sub and icr_* objects keep their names because, being created at top
# level, they remain in the workspace after the loop has finished.
for (i in seq_len(2)) {
  # Experts (toxicity column is toxic_2 in the long expert table)
  experts_long_sub <- filter(experts_long, unit_var %in% four_way[[i]])
  icr_experts_h <- test_icr(experts_long_sub, unit_var = unit_var, coder_var = annotator, ishatespeech)
  experts_long_sub <- filter(experts_long, unit_var %in% four_way[[i + 2]])
  icr_experts_t <- test_icr(experts_long_sub, unit_var = unit_var, coder_var = annotator, toxic_2)
  experts_long_sub <- filter(experts_long, unit_var %in% thre_way[[i]])
  icr_experts_1 <- test_icr(experts_long_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # Appen
  appen_sub <- filter(appen, unit_var %in% four_way[[i]])
  icr_appen_h <- test_icr(appen_sub, unit_var = unit_var, coder_var = annotator, ishatespeech)
  appen_sub <- filter(appen, unit_var %in% four_way[[i + 2]])
  icr_appen_t <- test_icr(appen_sub, unit_var = unit_var, coder_var = annotator, toxic)
  appen_sub <- filter(appen, unit_var %in% thre_way[[i]])
  icr_appen_1 <- test_icr(appen_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # NGO
  ngo_sub <- filter(ngo, unit_var %in% four_way[[i]])
  icr_ngo_h <- test_icr(ngo_sub, unit_var = unit_var, coder_var = annotator, ishatespeech)
  ngo_sub <- filter(ngo, unit_var %in% four_way[[i + 2]])
  icr_ngo_t <- test_icr(ngo_sub, unit_var = unit_var, coder_var = annotator, toxic)
  ngo_sub <- filter(ngo, unit_var %in% thre_way[[i]])
  icr_ngo_1 <- test_icr(ngo_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # Prolific
  prolific_sub <- filter(prolific, unit_var %in% four_way[[i]])
  icr_proli_h <- test_icr(prolific_sub, unit_var = unit_var, coder_var = annotator, ishatespeech)
  prolific_sub <- filter(prolific, unit_var %in% four_way[[i + 2]])
  icr_proli_t <- test_icr(prolific_sub, unit_var = unit_var, coder_var = annotator, toxic)
  prolific_sub <- filter(prolific, unit_var %in% thre_way[[i]])
  icr_proli_1 <- test_icr(prolific_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # Research assistants
  ras_sub <- filter(ras, unit_var %in% four_way[[i]])
  icr_reass_h <- test_icr(ras_sub, unit_var = unit_var, coder_var = annotator, ishatespeech)
  ras_sub <- filter(ras, unit_var %in% four_way[[i + 2]])
  icr_reass_t <- test_icr(ras_sub, unit_var = unit_var, coder_var = annotator, toxic)
  ras_sub <- filter(ras, unit_var %in% thre_way[[i]])
  icr_reass_1 <- test_icr(ras_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # Citizen science
  citi_sub <- filter(citi, unit_var %in% four_way[[i]])
  icr_citi_h <- test_icr(citi_sub, unit_var = unit_var, coder_var = annotator, ishatespeech)
  citi_sub <- filter(citi, unit_var %in% four_way[[i + 2]])
  icr_citi_t <- test_icr(citi_sub, unit_var = unit_var, coder_var = annotator, toxic)
  citi_sub <- filter(citi, unit_var %in% thre_way[[i]])
  icr_citi_1 <- test_icr(citi_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # chatGPT, United Nations definition
  chatgpt_sub_n <- filter(chatgpt_un_def_version, unit_var %in% four_way[[i]])
  icr_chat_n_h <- test_icr(chatgpt_sub_n, unit_var = unit_var, coder_var = annotator, ishatespeech)
  chatgpt_sub_n <- filter(chatgpt_un_def_version, unit_var %in% four_way[[i + 2]])
  icr_chat_n_t <- test_icr(chatgpt_sub_n, unit_var = unit_var, coder_var = annotator, toxic)
  chatgpt_sub_n <- filter(chatgpt_un_def_version, unit_var %in% thre_way[[i]])
  icr_chat_n_1 <- test_icr(chatgpt_sub_n, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # chatGPT, sensitivity check
  chatgpt_sub_s <- filter(chatgpt_sensitivity_def_version, unit_var %in% four_way[[i]])
  icr_chat_s_h <- test_icr(chatgpt_sub_s, unit_var = unit_var, coder_var = annotator, ishatespeech)
  chatgpt_sub_s <- filter(chatgpt_sensitivity_def_version, unit_var %in% four_way[[i + 2]])
  icr_chat_s_t <- test_icr(chatgpt_sub_s, unit_var = unit_var, coder_var = annotator, toxic)
  chatgpt_sub_s <- filter(chatgpt_sensitivity_def_version, unit_var %in% thre_way[[i]])
  icr_chat_s_1 <- test_icr(chatgpt_sub_s, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # chatGPT, main definition
  chatgpt_sub_m <- filter(chatgpt_main_def_version, unit_var %in% four_way[[i]])
  icr_chat_m_h <- test_icr(chatgpt_sub_m, unit_var = unit_var, coder_var = annotator, ishatespeech)
  chatgpt_sub_m <- filter(chatgpt_main_def_version, unit_var %in% four_way[[i + 2]])
  icr_chat_m_t <- test_icr(chatgpt_sub_m, unit_var = unit_var, coder_var = annotator, toxic)
  chatgpt_sub_m <- filter(chatgpt_main_def_version, unit_var %in% thre_way[[i]])
  icr_chat_m_1 <- test_icr(chatgpt_sub_m, unit_var = unit_var, coder_var = annotator, is_hs_tox)

  # Across groups (each group is one coder)
  df_1_sub <- filter(df_1, unit_var %in% four_way[[i]])
  icr_combined_h <- mutate(
    test_icr(df_1_sub, unit_var = unit_var, coder_var = group, ishate_combined),
    Platform = "Combined ICR"
  )
  df_1_sub <- filter(df_1, unit_var %in% four_way[[i + 2]])
  icr_combined_t <- mutate(
    test_icr(df_1_sub, unit_var = unit_var, coder_var = group, istoxic_combined),
    Platform = "Combined ICR"
  )
  df_1_sub <- filter(df_1, unit_var %in% thre_way[[i]])
  icr_combined_1 <- mutate(
    test_icr(df_1_sub, unit_var = unit_var, coder_var = group, ishstox_combined),
    Platform = "Combined ICR"
  )

  # Rows are bound in the same group/variable order as the rows of icr_table,
  # so the statistics can be attached column-wise below.
  icr_fourway <- round_df(
    dplyr::bind_rows(
      icr_experts_h, icr_experts_t, icr_experts_1,
      icr_reass_h, icr_reass_t, icr_reass_1,
      icr_ngo_h, icr_ngo_t, icr_ngo_1,
      icr_proli_h, icr_proli_t, icr_proli_1,
      icr_appen_h, icr_appen_t, icr_appen_1,
      icr_citi_h, icr_citi_t, icr_citi_1,
      icr_chat_n_h, icr_chat_n_t, icr_chat_n_1,
      icr_chat_s_h, icr_chat_s_t, icr_chat_s_1,
      icr_chat_m_h, icr_chat_m_t, icr_chat_m_1,
      icr_combined_h, icr_combined_t, icr_combined_1
    ),
    digits = 3
  )

  # Keep the four statistics and label them for the current subset.
  icr_fourway <- dplyr::select(icr_fourway, c(n_Units, Agreement, Holstis_CR, Krippendorffs_Alpha))
  colnames(icr_fourway) <- c(four_way_name_unit[i], four_way_name[i], four_way_holist[i], four_way_krip[i])

  icr_table <- bind_cols(icr_table, icr_fourway)
}

# Three-class reliability on the third subset, thre_way[[3]] (units the expert
# consensus coded as neither hate speech nor toxic), for every group and
# across groups.
experts_long_sub <- experts_long %>% filter(unit_var %in% thre_way[[3]])
icr_experts_3 <- test_icr(experts_long_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)
appen_sub <- appen %>% filter(unit_var %in% thre_way[[3]])
icr_appen_3 <- test_icr(appen_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)
# Fix: the NGO subset used to be stored under a mistyped name (llianceF_sub),
# so icr_ngo_3 was computed on the stale ngo_sub left over from the last pass
# of the preceding loop instead of on the thre_way[[3]] subset.
ngo_sub <- ngo %>% filter(unit_var %in% thre_way[[3]])
icr_ngo_3 <- test_icr(ngo_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)
prolific_sub <- prolific %>% filter(unit_var %in% thre_way[[3]])
icr_proli_3 <- test_icr(prolific_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)
ras_sub <- ras %>% filter(unit_var %in% thre_way[[3]])
icr_reass_3 <- test_icr(ras_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)
citi_sub <- citi %>% filter(unit_var %in% thre_way[[3]])
icr_citi_3 <- test_icr(citi_sub, unit_var = unit_var, coder_var = annotator, is_hs_tox)
chatgpt_sub_n <- chatgpt_un_def_version %>% filter(unit_var %in% thre_way[[3]])
icr_chat_n_3 <- test_icr(chatgpt_sub_n, unit_var = unit_var, coder_var = annotator, is_hs_tox)
chatgpt_sub_s <- chatgpt_sensitivity_def_version %>% filter(unit_var %in% thre_way[[3]])
icr_chat_s_3 <- test_icr(chatgpt_sub_s, unit_var = unit_var, coder_var = annotator, is_hs_tox)
chatgpt_sub_m <- chatgpt_main_def_version %>% filter(unit_var %in% thre_way[[3]])
icr_chat_m_3 <- test_icr(chatgpt_sub_m, unit_var = unit_var, coder_var = annotator, is_hs_tox)
df_1_sub <- df_1 %>% filter(unit_var %in% thre_way[[3]])
icr_combined_3 <- test_icr(df_1_sub, unit_var = unit_var, coder_var = group, ishstox_combined) %>% mutate(Platform = "Combined ICR")

# Same group order as the rows of icr_table (experts, RAs, NGO, Prolific,
# Appen, citizen science, the three chatGPT variants, combined).
icr_threway <- dplyr::bind_rows(icr_experts_3,icr_reass_3,icr_ngo_3,icr_proli_3,
                                icr_appen_3,icr_citi_3,icr_chat_n_3,icr_chat_s_3,
                                icr_chat_m_3,icr_combined_3)

# Drop intermediate per-group results that are no longer needed.
rm(icr_ngo,icr_ngo_1,icr_ngo_3,icr_ngo_h,icr_ngo_t,icr_appen,icr_appen_1,icr_appen_3,icr_appen_h,icr_appen_t,
   icr_citi,icr_citi_1,icr_citi_3,icr_citi_h,icr_citi_t,icr_proli,icr_proli_1,icr_proli_3,icr_proli_h,icr_proli_t,
   icr_experts,icr_experts_1,icr_experts_3,icr_experts_h,icr_experts_t)

# Keep the four statistics and label them as belonging to the "Neither" subset.
icr_threway <- icr_threway %>% dplyr::select(c(n_Units,Agreement,Holstis_CR, Krippendorffs_Alpha))
colnames(icr_threway) <- c("n Units (Neither)", "Agreement (Neither)", "Holstis_CR (Neither)", "Krippendorffs_Alpha (Neither)")


# Overwrite the variable label with readable names; the rows come in blocks of
# three (hate speech, toxic speech, combined label) for each of the ten
# platforms.
icr_table$Variable <- rep(c("Hate Speech", "Toxic Speech", "Both together"), times = 10)

# Split into the table for the combined label and the table for the two
# binary variables.
icr_table_3classes <- filter(icr_table, Variable == "Both together")
icr_table <- filter(icr_table, Variable != "Both together")

# For the combined label the subset columns refer to the three classes, so add
# the "Neither" statistics and relabel the Positive/Negative columns.
icr_table_3classes <- bind_cols(icr_table_3classes, icr_threway)
colnames(icr_table_3classes) <- c(
  "Variable", "n_Units", "n_Coders", "n_Categories", "Level", "Agreement",
  "Holstis_CR", "Krippendorffs_Alpha", "Platform",
  "n_Units (Hate Speech)", "Agreement (Hate Speech)",
  "Holstis_CR (Hate Speech)", "Krippendorffs_Alpha (Hate Speech)",
  "n_Units (Toxic Speech)", "Agreement (Toxic Speech)",
  "Holstis_CR (Toxic Speech)", "Krippendorffs_Alpha (Toxic Speech)",
  "n_Units (Neither)", "Agreement (Neither)",
  "Holstis_CR (Neither)", "Krippendorffs_Alpha (Neither)"
)

# Export both tables as HTML and CSV.
stargazer::stargazer(
  icr_table,
  type = "html",
  out = "../img_gpt_4o/icr_hatespeech_table_full_3_classes.html",
  out.header = FALSE,
  summary = FALSE,
  digits = 2
)
write_csv(icr_table, "../img_gpt_4o/icr_table_full_3_classes.csv")

stargazer::stargazer(
  icr_table_3classes,
  type = "html",
  out = "../img_gpt_4o/icr_hatespeech_table_full_3_classes_2.html",
  out.header = FALSE,
  summary = FALSE,
  digits = 2
)
write_csv(icr_table_3classes, "../img_gpt_4o/icr_table_full_3_classes_2.csv")
################################################################################
# Accuracy for each group
################################################################################
# Spread the stacked table so that every annotation column exists once per
# group (column name = <variable>_<group>). The text columns and the
# difficult-case flag are dropped first.
df_2 <- df_1 %>%
  group_by(newuniqueid) %>%
  select(-titel, -text, -kommentar, -difficult_case) %>%
  pivot_wider(
    names_from = group,
    values_from = c(
      ishatespeech_1, ishatespeech_2, ishatespeech_3,
      target_group_1, target_group_3, target_group_2,
      istoxic_1, istoxic_2, istoxic_3,
      is_hs_tox_1, is_hs_tox_2, is_hs_tox_3,
      ishate_combined, target_combined,
      istoxic_combined, ishstox_combined
    )
  )

# Show the resulting column names.
names(df_2)
# List of groups
# Group suffixes as they appear in the column names created by the pivot
# above (<variable>_<group>).
group_vars <- c("experts", "ra", "ngo", "appen", "Prolific", "citi", 
                "chatGPT (United Nations Definition)", "chatGPT (Sensitivity Definition)", "chatGPT")

# Apply transformation to all groups
# Make sure this has the same layout as the rest: for the sensitivity check,
# toxic is a class of its own from the beginning (as in fine-tuning), so a
# toxic comment has hate speech = 0. Set hate speech to 1 wherever toxic is 1
# so the recoding below treats it like the other groups.
df_2 <- df_2 %>% mutate(`ishate_combined_chatGPT (Sensitivity Definition)` = ifelse(`istoxic_combined_chatGPT (Sensitivity Definition)` == 1, 1, `ishate_combined_chatGPT (Sensitivity Definition)`))

# Collapse the two binary labels of every group into one three-class code:
# 1 = hate speech and not toxic, 2 = hate speech and toxic, 0 = otherwise.
# Inside across(), cur_column() is the ishate_combined_<group> column being
# processed; get() looks up the matching istoxic_combined_<group> column.
# .names first yields is_hatetox_combined_ishate_combined_<group>; the
# rename_with() step removes the inner "ishate_combined_" part again.
df_2 <- df_2 %>%
  mutate(across(
    all_of(paste0("ishate_combined_", group_vars)), 
    ~ ifelse(. == 1 & get(paste0("istoxic_combined_", sub("ishate_combined_", "", cur_column()))) == 0, 1,
             ifelse(. == 1 & get(paste0("istoxic_combined_", sub("ishate_combined_", "", cur_column()))) == 1, 2, 0)),
    .names = "is_hatetox_combined_{.col}"
  )) %>%
  rename_with(~ sub("ishate_combined_", "", .), starts_with("is_hatetox_combined_"))  # Clean column names



# Recoding the predictions and base group annotations
# All non-expert groups, including the two alternative chatGPT definitions.
group_cols <- c("is_hatetox_combined_ra","is_hatetox_combined_ngo", "is_hatetox_combined_citi", 
                "is_hatetox_combined_Prolific", "is_hatetox_combined_appen", 
                "is_hatetox_combined_chatGPT (Sensitivity Definition)", "is_hatetox_combined_chatGPT (United Nations Definition)", 
                "is_hatetox_combined_chatGPT")

# Same list without the two alternative chatGPT definitions.
group_cols_gpt_reduce <- c("is_hatetox_combined_ra","is_hatetox_combined_ngo", "is_hatetox_combined_citi", 
                           "is_hatetox_combined_Prolific", "is_hatetox_combined_appen", "is_hatetox_combined_chatGPT")

# Column holding the reference (expert) three-class code.
base_group <- "is_hatetox_combined_experts"

# Get examples where all agree and where all disagree!
# TRUE where every group in group_cols has the same code as the base group.
df_2$all_agree <- rowSums(df_2[group_cols] == df_2[[base_group]]) == length(group_cols)
# TRUE where none of the groups in the reduced list has the same code as the
# base group. Fix: the disagreements used to be counted over all columns in
# group_cols but compared with length(group_cols_gpt_reduce), so a row was
# flagged when exactly that many of the full set disagreed. Columns and count
# now use the same (reduced) list.
# NOTE(review): if "none agree" is meant to cover the alternative chatGPT
# definitions as well, use group_cols on both sides instead.
df_2$none_agree <- rowSums(df_2[group_cols_gpt_reduce] != df_2[[base_group]]) == length(group_cols_gpt_reduce)


# Up to five random examples per label where all groups agree.
df_samples_all_agree <- df_2 %>% filter(all_agree == TRUE)
# NOTE(review): texts and labels are taken from citi_wide here although the
# label is renamed to ishate_experts, while the none-agree sample below reads
# experts_wide -- confirm which table is intended.
df_samples_all_agree <- citi_wide %>%
  filter(articleid %in% df_samples_all_agree$articleid) %>%
  select(c(newuniqueid, kommentar, titel, text, ishate_combined)) %>%
  group_by(ishate_combined) %>%
  dplyr::slice_sample(n = 5) %>%
  rename(ishate_experts = ishate_combined)
write_csv(df_samples_all_agree, "../img_gpt_4o/examples_where_all_agree_3_classes.csv")

# Up to five random examples per label where no group agrees with the experts.
df_samples_none_agree <- df_2 %>% filter(none_agree == TRUE)
df_samples_none_agree <- experts_wide %>%
  filter(articleid %in% df_samples_none_agree$articleid) %>%
  select(c(newuniqueid, kommentar, titel, text, ishate_combined)) %>%
  group_by(ishate_combined) %>%
  dplyr::slice_sample(n = 5) %>%
  rename(ishate_experts = ishate_combined)
write_csv(df_samples_none_agree, "../img_gpt_4o/examples_where_none_agree_3_classes.csv")



# Display names, in the same order as group_cols
friendly_names <- c(
  "Research Assistants", "NGO", "Citizen Science", "Prolific", "Appen",
  "chatGPT (Sensitivity Definition)", "chatGPT (United Nations Definition)", "chatGPT"
)

# Translate the numeric codes of all compared columns into text labels:
# 1 -> "Hate", 2 -> "Toxic", anything else -> "Neither"
conf_df_a <- mutate(
  df_2,
  across(
    all_of(c(group_cols, base_group)),
    function(code) {
      ifelse(code == 1, "Hate", ifelse(code == 2, "Toxic", "Neither"))
    }
  )
)

# Bootstrapped percentile confidence intervals for the multinomial metrics.
#
# Arguments:
#   data       data frame containing the reference and the prediction column
#   base_group name of the column holding the reference labels
#   pred_col   name of the column holding the labels to evaluate
#   n_boot     number of bootstrap replicates (default 1000)
#   n_cpus     number of processes handed to boot() (default 8)
#
# Returns a named list ("Overall Accuracy", "Balanced Accuracy", "F1",
# "Sensitivity", "PosPredValue"); each element is c(lower, upper), or
# c(NA, NA) when boot.ci() cannot compute an interval for that metric.
calculate_boot_ci <- function(data, base_group, pred_col, n_boot = 1000, n_cpus = 8) {
  # Define all possible classes
  all_classes <- c("Hate", "Toxic", "Neither")

  # Append one copy of the first row per (base class, predicted class) pair,
  # overwriting only the two label columns.
  add_dummy_rows <- function(sample, base_classes, pred_classes) {
    for (base_class in base_classes) {
      for (pred_class in pred_classes) {
        dummy_row <- sample[1, ]  # Create a template row
        dummy_row[[base_group]] <- base_class
        dummy_row[[pred_col]] <- pred_class
        sample <- rbind(sample, dummy_row)
      }
    }
    sample
  }

  # Define bootstrapping statistic function
  boot_stat_func <- function(data, indices) {
    # Extract bootstrap sample
    sample <- data[indices, ]

    # Fixes issue with Prolific (will introduce minimal bias but otherwise
    # there will be no results): a resample can lack a class entirely, so
    # dummy rows are added for every class missing on either side.
    missing_in_base_group <- setdiff(all_classes, unique(sample[[base_group]]))
    missing_in_pred_col <- setdiff(all_classes, unique(sample[[pred_col]]))

    sample <- add_dummy_rows(sample, missing_in_base_group, all_classes)
    sample <- add_dummy_rows(sample, all_classes, missing_in_pred_col)

    # Evaluate using cvms::evaluate
    eval_res <- cvms::evaluate(
      data = sample,
      target_col = base_group,
      prediction_cols = pred_col,
      type = "multinomial"
    )

    # Return the metrics
    with(eval_res, c(`Overall Accuracy` = `Overall Accuracy`, `Balanced Accuracy` = `Balanced Accuracy`, F1 = F1, Recall = Sensitivity, Precision = `Pos Pred Value`))
  }

  # Perform bootstrapping
  boot_res <- boot(data, boot_stat_func, R = n_boot, parallel = "multicore", ncpus = n_cpus)

  # Calculate confidence intervals for each metric
  ci_list <- lapply(seq_len(ncol(boot_res$t)), function(i) {
    ci <- boot.ci(boot_res, type = "perc", index = i)
    # boot.ci() returns NULL when all replicates of a statistic are equal;
    # return NA bounds so callers always get two values per metric.
    if (is.null(ci)) {
      return(c(NA_real_, NA_real_))
    }
    ci$percent[4:5]
  })
  names(ci_list) <- c("Overall Accuracy", "Balanced Accuracy", "F1", "Sensitivity", "PosPredValue")

  ci_list
}

# Initialize the list for storing evaluation results and CIs
conf_mat_no_fine_tuning <- list()
result_df <- NULL

# Names of the components of each result row (same for every group)
metric_names <- c("Group","Overall Accuracy", "Balanced Accuracy", "F1", "Sensitivity", "Precision",
                  "OverallAccuracyLowerCI", "OverallAccuracyUpperCI", 
                  "BalancedAccuracyLowerCI", "BalancedAccuracyUpperCI", 
                  "F1LowerCI", "F1UpperCI",
                  "SensitivityLowerCI", "SensitivityUpperCI", 
                  "PrecisionLowerCI", "PrecisionUpperCI")

# friendly_names is assigned by position after the loop, so it has to line up
# with group_cols
stopifnot(length(friendly_names) == length(group_cols))

# Loop over each group to evaluate and calculate CIs
for (group_col in group_cols) {
  # Keep only the reference column and the column under evaluation.
  # all_of() makes explicit that these are column names held in variables.
  tmp <- conf_df_a %>%
    ungroup() %>%
    select(all_of(c(base_group, group_col)))

  # Evaluate metrics using cvms::evaluate
  eval_results <- cvms::evaluate(tmp,
                                 prediction_col = group_col,
                                 target_col = base_group,
                                 type = "multinomial")
  
  # Store evaluation results
  conf_mat_no_fine_tuning[[group_col]] <- eval_results
  
  # Calculate CIs using bootstrapping
  ci_results <- calculate_boot_ci(tmp, base_group, group_col)

  # Store results with CIs. c() coerces everything to character because the
  # group name comes first; the values are converted back to numeric once the
  # result tables have been combined.
  row_tmp <- c(group_col,eval_results$`Overall Accuracy`,eval_results$`Balanced Accuracy`, eval_results$F1, eval_results$Sensitivity, eval_results$`Pos Pred Value`,
               ci_results$`Overall Accuracy`[1], ci_results$`Overall Accuracy`[2], ci_results$`Balanced Accuracy`[1], ci_results$`Balanced Accuracy`[2], 
               ci_results$F1[1], ci_results$F1[2],ci_results$Sensitivity[1], ci_results$Sensitivity[2], ci_results$PosPredValue[1], ci_results$PosPredValue[2])
  
  # Assign names to the row vector
  names(row_tmp) <- metric_names
  
  result_df <- dplyr::bind_rows(result_df, row_tmp)
}

# Setting friendly names for the groups in the results
names(conf_mat_no_fine_tuning) <- friendly_names
result_df$Group <- friendly_names
# Add a 'Type' column to the original DataFrame with "human labels"
result_df$Type <- "Accuracy of Human\nAnnotations"
################################################################################
# Evaluate the finetuned models!
################################################################################
# Recode the raw prediction strings of the fine-tuned models to the short
# class labels HATE / TOXIC / NEITHER (three classes, not a binary split).
# Names of the prediction data frames: one per annotation source, for the
# 100-row and the 250-row fine-tuning runs
datasets <- paste0(
  rep(c("expert", "ngo", "appen", "citi", "prolific", "ras", "gpt"), times = 2),
  "_df_",
  rep(c("100", "250"), each = 7)
)

# Shorten the labels in the 'prediction' column of `df`:
# "HATE SPEECH" -> "HATE", "TOXIC SPEECH" -> "TOXIC", any other value ->
# "NEITHER". Returns `df` with the column replaced.
recode_prediction <- function(df) {
  raw_label <- df$prediction
  is_hate <- raw_label == "HATE SPEECH"
  is_toxic <- raw_label == "TOXIC SPEECH"
  df$prediction <- ifelse(is_hate, "HATE", ifelse(is_toxic, "TOXIC", "NEITHER"))
  df
}

# Recode every prediction data frame in place (looked up and re-assigned by name)
for (dataset in datasets) {
  recoded <- recode_prediction(get(dataset))
  assign(dataset, recoded)
}


# Unnamed list of the recoded data frames, in the order given by `datasets`
df_list <- unname(mget(datasets))


# Display names, in the same order as df_list
friendly_names <- paste(
  rep(c("Experts", "NGO", "Appen", "Citizen Science", "Prolific", "Research Assistants", "chatGPT"), times = 2),
  rep(c("100", "250"), each = 7)
)

# Bootstrapped percentile confidence intervals for the fine-tuned models.
# Replaces the three-argument calculate_boot_ci() defined earlier in this file.
#
# Arguments:
#   data    data frame with the character columns "prediction" and
#           "label_column", both coded "0", "1" or "2"
#   n_boot  number of bootstrap replicates (default 1000)
#   n_cpus  number of processes handed to boot() (default 8)
#
# Returns a named list ("Overall Accuracy", "Balanced Accuracy", "F1",
# "Sensitivity", "PosPredValue"); each element is c(lower, upper), or
# c(NA, NA) when boot.ci() cannot compute an interval for that metric.
calculate_boot_ci <- function(data, n_boot = 1000, n_cpus = 8) {
  # Define all possible classes
  all_classes <- c("2", "1", "0")

  # Append one copy of the first row per (label class, predicted class) pair,
  # overwriting only the two label columns.
  add_dummy_rows <- function(sample, label_classes, pred_classes) {
    for (base_class in label_classes) {
      for (pred_class in pred_classes) {
        dummy_row <- sample[1, ]  # Create a template row
        dummy_row[["label_column"]] <- base_class
        dummy_row[["prediction"]] <- pred_class
        sample <- rbind(sample, dummy_row)
      }
    }
    sample
  }

  # Define bootstrapping statistic function
  boot_stat_func <- function(data, indices) {
    # Extract bootstrap sample
    sample <- data[indices, ]

    # Fixes issue with Prolific (will introduce minimal bias but otherwise
    # there will be no results): a resample can lack a class entirely, so
    # dummy rows are added for every class missing on either side.
    missing_in_prediction <- setdiff(all_classes, unique(sample[["prediction"]]))
    missing_in_label_column <- setdiff(all_classes, unique(sample[["label_column"]]))

    sample <- add_dummy_rows(sample, missing_in_label_column, all_classes)
    sample <- add_dummy_rows(sample, all_classes, missing_in_prediction)

    # Calculate metrics
    eval_res <- cvms::evaluate(sample,
                               prediction_col= "prediction",
                               target_col = "label_column",
                               type = "multinomial")
    # Return the metrics
    with(eval_res, c(`Overall Accuracy` = `Overall Accuracy`, `Balanced Accuracy` = `Balanced Accuracy`, F1 = F1, Recall = Sensitivity, Precision = `Pos Pred Value`))
  }

  # Perform bootstrapping
  boot_res <- boot(data, boot_stat_func, R = n_boot, parallel = "multicore", ncpus = n_cpus)

  # Calculate confidence intervals for each metric
  ci_list <- lapply(seq_len(ncol(boot_res$t)), function(i) {
    ci <- boot.ci(boot_res, type = "perc", index = i)
    # boot.ci() returns NULL when all replicates of a statistic are equal;
    # return NA bounds so callers always get two values per metric.
    if (is.null(ci)) {
      return(c(NA_real_, NA_real_))
    }
    ci$percent[4:5]
  })
  names(ci_list) <- c("Overall Accuracy", "Balanced Accuracy", "F1", "Sensitivity", "PosPredValue")

  ci_list
}

# Initialize the list for storing evaluation results and CIs
conf_mat_with_fine_tuning <- list()
result_df_fine <- NULL

# Names of the components of each result row (same for every model)
metric_names <- c("Group","Overall Accuracy", "Balanced Accuracy", "F1", "Sensitivity", "Precision",
                  "OverallAccuracyLowerCI", "OverallAccuracyUpperCI", 
                  "BalancedAccuracyLowerCI", "BalancedAccuracyUpperCI", 
                  "F1LowerCI", "F1UpperCI",
                  "SensitivityLowerCI", "SensitivityUpperCI", 
                  "PrecisionLowerCI", "PrecisionUpperCI")

# Results are labelled by position, so the names must line up with df_list
stopifnot(length(friendly_names) == length(df_list))

# Loop over each fine-tuned model to evaluate and calculate CIs.
# seq_along() keeps the loop from running when df_list is empty.
for (i in seq_along(df_list)) {
  # Code the predictions like the labels: HATE -> 1, TOXIC -> 2, else 0,
  # and compare both columns as character
  tmp <- df_list[[i]] %>% 
    mutate(prediction = ifelse(prediction == "HATE", 1,
                               ifelse(prediction == "TOXIC", 2, 0)))
  tmp$prediction <- as.character(tmp$prediction)
  tmp$label_column <- as.character(tmp$label_column)
  
  # Evaluate metrics using cvms::evaluate
  eval_results <- cvms::evaluate(tmp,
                                 prediction_col = "prediction",
                                 target_col = "label_column",
                                 type = "multinomial")
  
  # Store evaluation results
  conf_mat_with_fine_tuning[[friendly_names[i]]] <- eval_results
  
  # Calculate CIs using bootstrapping
  ci_results_fine <- calculate_boot_ci(tmp)
  
  # Store results with CIs. c() coerces everything to character because the
  # model name comes first; the values are converted back to numeric once the
  # result tables have been combined.
  row_tmp <- c(friendly_names[i],eval_results$`Overall Accuracy`,eval_results$`Balanced Accuracy`, eval_results$F1, eval_results$Sensitivity, eval_results$`Pos Pred Value`,
               ci_results_fine$`Overall Accuracy`[1], ci_results_fine$`Overall Accuracy`[2], ci_results_fine$`Balanced Accuracy`[1], ci_results_fine$`Balanced Accuracy`[2], 
               ci_results_fine$F1[1], ci_results_fine$F1[2],ci_results_fine$Sensitivity[1], ci_results_fine$Sensitivity[2], ci_results_fine$PosPredValue[1], ci_results_fine$PosPredValue[2])
  
  # Assign names to the row vector
  names(row_tmp) <- metric_names
  
  result_df_fine <- dplyr::bind_rows(result_df_fine, row_tmp)
}

# First 7 rows belong to the 100-label runs, the next 7 to the 250-label runs
Type <- rep(c("GPT-4o-mini Accuracy after\nfine-tuning on 100 labels", "GPT-4o-mini Accuracy after\nfine-tuning on 250 labels"), each = 7)
result_df_fine$Type <- Type


# Setting friendly names for the groups in the results
names(conf_mat_with_fine_tuning) <- friendly_names

# Stack the human/zero-shot rows on top of the fine-tuning rows
result_df <- dplyr::bind_rows(result_df, result_df_fine)
# Strip the " 100" / " 250" suffix so both runs of a source share one group
result_df$Group <- gsub("\\s100|\\s250", "", result_df$Group)
# Columns 2 to 16 hold the metrics and CI bounds, stored as character so far
result_df <- mutate(result_df, across(.cols = 2:16, .fns = as.numeric))

result_df

# Impute NaN from Simulations!
# A missing F1 / Precision point estimate is replaced by the midpoint of its
# bootstrap confidence interval
result_df <- result_df %>%
  mutate(
    F1 = ifelse(is.na(F1), (F1LowerCI + F1UpperCI) / 2, F1),
    Precision = ifelse(is.na(Precision), (PrecisionLowerCI + PrecisionUpperCI) / 2, Precision)
  )

write_csv(result_df, "../img_gpt_4o/classification_metrics_all_3_classes.csv")
################################################################################
# Figure accuracy for each group
################################################################################
# Pull the chatGPT row out of the human-annotation block and keep its overall
# accuracy together with the CI bounds as baseline values for the plot
chatGPT_acc <- dplyr::filter(
  result_df,
  Group %in% c("chatGPT") & Type == "Accuracy of Human\nAnnotations"
)
chatGPT_lower <- as.numeric(chatGPT_acc$OverallAccuracyLowerCI)
chatGPT_upper <- as.numeric(chatGPT_acc$OverallAccuracyUpperCI)
chatGPT_acc <- as.numeric(chatGPT_acc$`Overall Accuracy`)


# Drop all chatGPT variants and the experts from the rows to be plotted
filtered_results_df <- dplyr::filter(
  result_df,
  !is.na(Group),
  !(Group %in% c("chatGPT",
                 "chatGPT (Sensitivity Definition)",
                 "chatGPT (United Nations Definition)",
                 "Experts"))
)


# Disabled alternative: put the fine-tuning types before the human annotations
if (FALSE) {
  filtered_results_df$Type <- factor(filtered_results_df$Type, 
                                     levels = c("GPT-4o-mini Accuracy after\nfine-tuning on 250 labels",
                                                "GPT-4o-mini Accuracy after\nfine-tuning on 100 labels",
                                                "Accuracy of Human\nAnnotations"))
}

# Disabled alternative: order the groups by their mean accuracy
if (FALSE) {
  group_order <- filtered_results_df %>% 
    group_by(Group) %>% 
    summarise(mean_acc = mean(Accuracy)) %>% 
    arrange(-mean_acc) %>% 
    pull(Group)
  
  filtered_results_df$Group <- factor(filtered_results_df$Group, levels = group_order)
}


# Disabled alternative: order the groups by the accuracy of the human annotations
if (FALSE) {
  human_annotations <- filtered_results_df %>% 
    filter(Type == "Accuracy of Human\nAnnotations") %>%
    arrange(desc(`Overall Accuracy`))
  
  filtered_results_df$Group <- factor(filtered_results_df$Group, levels = human_annotations$Group)
}

# Active variant: fixed group order
if (TRUE) {
  # Human-annotation rows sorted by overall accuracy
  human_annotations <- filtered_results_df %>% 
    filter(Type == "Accuracy of Human\nAnnotations") %>%
    arrange(desc(`Overall Accuracy`))
  
  # The levels are set to this fixed order, not to the order in human_annotations
  filtered_results_df$Group <- factor(
    filtered_results_df$Group,
    levels = c("Research Assistants", "Prolific", "Citizen Science", "Appen", "NGO")
  )
}


# Plot: overall accuracy per group and type as dodged horizontal bars with
# bootstrap CIs. The chatGPT baseline is drawn as a solid line, its CI as
# dotted lines on a light-blue band.
pl2 <- filtered_results_df %>% filter(Group != "Experts") %>%
  ggplot(aes(x = Group, y = `Overall Accuracy`, fill = Type)) +
  geom_rect(aes(xmin = -Inf, xmax = 5.5, ymin = chatGPT_lower, ymax = chatGPT_upper), 
            fill = "lightblue", alpha = 0.2) +
  geom_bar(stat = "identity", position = position_dodge2(width = 1, padding = 0.1)) +
  geom_errorbar(aes(ymin = OverallAccuracyLowerCI, ymax = OverallAccuracyUpperCI, group = interaction(Group, Type)), 
                stat = "identity",
                width = .5, alpha = 0.9,
                position = position_dodge(width = 0.9)) +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_acc, yend = chatGPT_acc), 
               color = "black", 
               linewidth = 0.66, 
               alpha = 0.75, 
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_lower, yend = chatGPT_lower), 
               color = "black", 
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75, 
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_upper, yend = chatGPT_upper), 
               color = "black", 
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75, 
               lineend = "round") +
  labs(y = "Accuracy [%]", x = "", fill = "") +
  annotate("text", x = 5.65, y = chatGPT_acc, label = paste("GPT-4o: ", scales::percent(chatGPT_acc)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  #annotate("text", x = 5.65, y = chatGPT_acc, label = paste("GPT-4o-mini: ", round(chatGPT_acc, digits = 2)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  scale_y_continuous(labels = scales::percent_format(), expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  #scale_y_continuous(expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  # expansion() replaces the deprecated expand_scale(); the extra room at the
  # upper end of the axis holds the baseline label
  scale_x_discrete(expand = expansion(mult = c(0.1, 0.2))) +
  scale_fill_brewer(palette = "Dark2") +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "bottom",
        axis.text.y = element_text(size = 18, face = "bold", color = "black"),
        axis.text.x = element_text(size = 14, color = "black"),
        axis.title.x = element_text(size = 18, color = "black", face = "bold"),
        legend.text = element_text(size = 14),
        plot.margin = unit(c(2, .5, .5, .5), "lines"))

# Show plot
pl2

# Save plot
ggsave(plot = pl2, filename = "../img_gpt_4o/accuracy_groups_vs_experts_with_types_refined_and_baseline_3_classes.pdf", width = 12, height = 9, dpi = 300, device = cairo_pdf)
ggsave(plot = pl2, filename = "../img_gpt_4o/accuracy_groups_vs_experts_with_types_refined_and_baseline_3_classes.png", width = 12, height = 9, dpi = 300, bg = "White")

################################################################################
# Confusion Matrix
################################################################################
# Shorten the Type labels; the metric-specific wording is added again per plot
result_df$Type <- ifelse(
  result_df$Type == "Accuracy of Human\nAnnotations",
  "Human\nAnnotations",
  ifelse(
    result_df$Type == "GPT-4o-mini Accuracy after\nfine-tuning on 100 labels",
    "after\nfine-tuning on 100 labels",
    "after\nfine-tuning on 250 labels"
  )
)

# Pull the chatGPT row out of the human-annotation block and keep its
# precision together with the CI bounds as baseline values for the plot
chatGPT_pre <- dplyr::filter(result_df, Group == "chatGPT" & Type == "Human\nAnnotations")
chatGPT_pre_lower <- as.numeric(chatGPT_pre$PrecisionLowerCI)
chatGPT_pre_upper <- as.numeric(chatGPT_pre$PrecisionUpperCI)
chatGPT_pre <- as.numeric(chatGPT_pre$Precision)

# Drop all chatGPT variants and the experts from the rows to be plotted
filtered_results_df <- dplyr::filter(
  result_df,
  !is.na(Group),
  !(Group %in% c("chatGPT",
                 "chatGPT (Sensitivity Definition)",
                 "chatGPT (United Nations Definition)",
                 "Experts"))
)

# Disabled alternative: order the groups by the F1 of the human annotations
if (FALSE) {
  human_annotations <- filtered_results_df %>% 
    filter(Type == "Human\nAnnotations") %>%
    arrange(desc(F1))
  
  filtered_results_df$Group <- factor(filtered_results_df$Group, levels = human_annotations$Group)
}

# Active variant: fixed group order
if (TRUE) {
  # Human-annotation rows sorted by F1
  human_annotations <- filtered_results_df %>% 
    filter(Type == "Human\nAnnotations") %>%
    arrange(desc(F1))
  
  # The levels are set to this fixed order, not to the order in human_annotations
  filtered_results_df$Group <- factor(
    filtered_results_df$Group,
    levels = c("Research Assistants", "Prolific", "Citizen Science", "Appen", "NGO")
  )
}

# Plot: precision per group and type as dodged horizontal bars with bootstrap
# CIs. The chatGPT baseline is drawn as a solid line, its CI as dotted lines
# on a light-blue band.
pl3 <- filtered_results_df %>% dplyr::mutate(Type = ifelse(Type == "Human\nAnnotations", "Precision of Human\nAnnotations", paste0("GPT-4o Precision ",Type))) %>%
  dplyr::mutate(Type = factor(Type, 
                              levels = c("Precision of Human\nAnnotations",
                                         "GPT-4o Precision after\nfine-tuning on 100 labels",
                                         "GPT-4o Precision after\nfine-tuning on 250 labels"))) %>%
  dplyr::filter(Group != "chatGPT") %>% 
  ggplot(aes(x = Group, y = Precision, fill = Type)) +
  geom_rect(aes(xmin = -Inf, xmax = 5.5, ymin = chatGPT_pre_lower , ymax = chatGPT_pre_upper), 
            fill = "lightblue", alpha = 0.2) +
  geom_bar(stat = "identity", position = position_dodge2(width = 0.9, padding = 0.1)) +
  geom_errorbar(aes(ymin = PrecisionLowerCI, ymax = PrecisionUpperCI, group = interaction(Group, Type)), 
                stat = "identity",
                width = .5, alpha = 0.9,
                position = position_dodge(width = 0.9)) +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_pre, yend = chatGPT_pre), 
               color = "black", 
               linewidth = 0.66, 
               alpha = 0.75, 
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_pre_lower, yend = chatGPT_pre_lower), 
               color = "black", 
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75, 
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_pre_upper, yend = chatGPT_pre_upper), 
               color = "black", 
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75, 
               lineend = "round") +
  labs(y = "Precision", x = "", fill = "") +
  #annotate("text", x = 5.60, y = chatGPT_precision, label = paste("GPT-4o-mini: ", scales::percent(chatGPT_precision)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  annotate("text", x = 5.60, y = chatGPT_pre, label = paste("GPT-4o: ", round(chatGPT_pre, digits = 2)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  #scale_y_continuous(labels = scales::percent_format(), expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  scale_y_continuous(expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  # expansion() replaces the deprecated expand_scale(); the extra room at the
  # upper end of the axis holds the baseline label
  scale_x_discrete(expand = expansion(mult = c(0.1, 0.2))) +
  scale_fill_brewer(palette = "Dark2") +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "bottom",
        axis.text.y = element_text(size = 16, face = "bold", color = "black"),
        axis.text.x = element_text(size = 12, color = "black"),
        axis.title.x = element_text(size = 16, color = "black", face = "bold"),
        legend.text = element_text(size = 12),
        plot.margin = unit(c(2, .5, .5, .5), "lines"))

# Show plot
pl3


# Pull the chatGPT row out of the human-annotation block and keep its recall
# (Sensitivity) together with the CI bounds as baseline values for the plot
chatGPT_rec <- dplyr::filter(result_df, Group == "chatGPT" & Type == "Human\nAnnotations")
chatGPT_rec_lower <- as.numeric(chatGPT_rec$SensitivityLowerCI)
chatGPT_rec_upper <- as.numeric(chatGPT_rec$SensitivityUpperCI)
chatGPT_rec <- as.numeric(chatGPT_rec$Sensitivity)


# Drop all chatGPT variants and the experts from the rows to be plotted
filtered_results_df <- dplyr::filter(
  result_df,
  !is.na(Group),
  !(Group %in% c("chatGPT",
                 "chatGPT (Sensitivity Definition)",
                 "chatGPT (United Nations Definition)",
                 "Experts"))
)

# Disabled alternative: order the groups by the F1 of the human annotations
if (FALSE) {
  human_annotations <- filtered_results_df %>% 
    filter(Type == "Human\nAnnotations") %>%
    arrange(desc(F1))
  
  filtered_results_df$Group <- factor(filtered_results_df$Group, levels = human_annotations$Group)
}

# Active variant: fixed group order
if (TRUE) {
  # Human-annotation rows sorted by F1
  human_annotations <- filtered_results_df %>% 
    filter(Type == "Human\nAnnotations") %>%
    arrange(desc(F1))
  
  # The levels are set to this fixed order, not to the order in human_annotations
  filtered_results_df$Group <- factor(
    filtered_results_df$Group,
    levels = c("Research Assistants", "Prolific", "Citizen Science", "Appen", "NGO")
  )
}

# Plot: recall per group and type as dodged horizontal bars with bootstrap
# CIs. The chatGPT baseline is drawn as a solid line, its CI as dotted lines
# on a light-blue band.
pl4 <- filtered_results_df %>% dplyr::mutate(Type = ifelse(Type == "Human\nAnnotations", "Recall of Human\nAnnotations", paste0("GPT-4o Recall ",Type))) %>%
  dplyr::mutate(Type = factor(Type, 
                              levels = c("Recall of Human\nAnnotations",
                                         "GPT-4o Recall after\nfine-tuning on 100 labels",
                                         "GPT-4o Recall after\nfine-tuning on 250 labels"))) %>%
  dplyr::filter(Group != "chatGPT") %>% 
  ggplot(aes(x = Group, y = Sensitivity, fill = Type)) +
  geom_rect(aes(xmin = -Inf, xmax = 5.5, ymin = chatGPT_rec_lower , ymax = chatGPT_rec_upper), 
            fill = "lightblue", alpha = 0.2) +
  geom_bar(stat = "identity", position = position_dodge2(width = 0.9, padding = 0.1)) +
  geom_errorbar(aes(ymin = SensitivityLowerCI, ymax = SensitivityUpperCI, group = interaction(Group, Type)), 
                stat = "identity",
                width = .5, alpha = 0.9,
                position = position_dodge(width = 0.9)) +
  geom_segment(aes(x = -Inf, xend = 5.4, y = chatGPT_rec, yend = chatGPT_rec), 
               color = "black", 
               linewidth = .66, 
               alpha = 0.75, 
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_rec_lower, yend =chatGPT_rec_lower), 
               color = "black", 
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75, 
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_rec_upper, yend = chatGPT_rec_upper), 
               color = "black", 
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75, 
               lineend = "round") +
  labs(y = "Recall", x = "", fill = "") +
  #annotate("text", x = 5.60, y = chatGPT_recall, label = paste("GPT-4o-mini: ", scales::percent(chatGPT_recall)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  annotate("text", x = 5.60, y = chatGPT_rec, label = paste("GPT-4o: ", round(chatGPT_rec, digits = 2)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  #scale_y_continuous(labels = scales::percent_format(), expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  scale_y_continuous(expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  # expansion() replaces the deprecated expand_scale(); the extra room at the
  # upper end of the axis holds the baseline label
  scale_x_discrete(expand = expansion(mult = c(0.1, 0.2))) +
  scale_fill_brewer(palette = "Dark2") +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "bottom",
        axis.text.y = element_text(size = 16, face = "bold", color = "black"),
        axis.text.x = element_text(size = 12, color = "black"),
        axis.title.x = element_text(size = 16, color = "black", face = "bold"),
        legend.text = element_text(size = 12),
        plot.margin = unit(c(2, .5, .5, .5), "lines"))

# Show plot
pl4

# Pull the chatGPT row out of the human-annotation block and keep its F1
# together with the CI bounds as baseline values for the plot
chatGPT_f1 <- dplyr::filter(result_df, Group == "chatGPT" & Type == "Human\nAnnotations")
chatGPT_f1_lower <- as.numeric(chatGPT_f1$F1LowerCI)
chatGPT_f1_upper <- as.numeric(chatGPT_f1$F1UpperCI)
chatGPT_f1 <- as.numeric(chatGPT_f1$F1)


# Drop all chatGPT variants and the experts from the rows to be plotted
filtered_results_df <- dplyr::filter(
  result_df,
  !is.na(Group),
  !(Group %in% c("chatGPT",
                 "chatGPT (Sensitivity Definition)",
                 "chatGPT (United Nations Definition)",
                 "Experts"))
)


# Disabled alternative: order the groups by the F1 of the human annotations
if (FALSE) {
  human_annotations <- filtered_results_df %>% 
    filter(Type == "Human\nAnnotations") %>%
    arrange(desc(F1))
  
  filtered_results_df$Group <- factor(filtered_results_df$Group, levels = human_annotations$Group)
}

# Active variant: fixed group order
if (TRUE) {
  # Human-annotation rows sorted by F1
  human_annotations <- filtered_results_df %>% 
    filter(Type == "Human\nAnnotations") %>%
    arrange(desc(F1))
  
  # The levels are set to this fixed order, not to the order in human_annotations
  filtered_results_df$Group <- factor(
    filtered_results_df$Group,
    levels = c("Research Assistants", "Prolific", "Citizen Science", "Appen", "NGO")
  )
}

# Plot: F1-score per group and type as dodged horizontal bars with bootstrap
# CIs. The chatGPT baseline is drawn as a solid line, its CI as dotted lines
# on a light-blue band.
pl5 <- filtered_results_df %>% dplyr::mutate(Type = ifelse(Type == "Human\nAnnotations", "F1-Score of Human\nAnnotations", paste0("GPT-4o F1-Score ",Type))) %>%
  dplyr::mutate(Type = factor(Type, 
                              levels = c("F1-Score of Human\nAnnotations",
                                         "GPT-4o F1-Score after\nfine-tuning on 100 labels",
                                         "GPT-4o F1-Score after\nfine-tuning on 250 labels"))) %>%
  dplyr::filter(Group != "chatGPT") %>% 
  ggplot(aes(x = Group, y = F1, fill = Type)) +
  geom_rect(aes(xmin = -Inf, xmax = 5.5, ymin = chatGPT_f1_lower, ymax = chatGPT_f1_upper), 
            fill = "lightblue", alpha = 0.2) +
  geom_bar(stat = "identity", position = position_dodge2(width = 0.9, padding = 0.1)) +
  geom_errorbar(aes(ymin = F1LowerCI, ymax = F1UpperCI, group = interaction(Group, Type)), 
                stat = "identity",
                width = .5, alpha = 0.9,
                position = position_dodge(width = 0.9)) +
  geom_segment(aes(x = -Inf, xend = 5.4, y = chatGPT_f1, yend = chatGPT_f1), 
               color = "black", 
               linewidth = .66, 
               alpha = 0.75, 
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_f1_lower, yend = chatGPT_f1_lower), 
               color = "black", 
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75, 
               lineend = "round") +
  geom_segment(aes(x = -Inf, xend = 5.5, y = chatGPT_f1_upper, yend = chatGPT_f1_upper), 
               color = "black", 
               linewidth = 0.33,
               linetype = "dotted",
               alpha = 0.75, 
               lineend = "round") +
  labs(y = "F1-Score", x = "", fill = "") +
  #annotate("text", x = 5.60, y = chatGPT_f1, label = paste("GPT-4o-mini: ", scales::percent(chatGPT_f1)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  #scale_y_continuous(labels = scales::percent_format(), expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  annotate("text", x = 5.60, y = chatGPT_f1, label = paste("GPT-4o: ", round(chatGPT_f1, digits = 2)), hjust = .5, color = "black", fontface = "bold", size = 6) +
  scale_y_continuous(expand = c(0,0), limits = c(0,1.02), breaks = seq(0,1, by = 0.1)) +
  # expansion() replaces the deprecated expand_scale(); the extra room at the
  # upper end of the axis holds the baseline label
  scale_x_discrete(expand = expansion(mult = c(0.1, 0.2))) +
  scale_fill_brewer(palette = "Dark2") +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "bottom",
        axis.text.y = element_text(size = 16, face = "bold", color = "black"),
        axis.text.x = element_text(size = 12, color = "black"),
        axis.title.x = element_text(size = 16, color = "black", face = "bold"),
        legend.text = element_text(size = 12),
        plot.margin = unit(c(2, .5, .5, .5), "lines"))

# Show plot
pl5

# Stack the F1, precision and recall plots in a single column
combined_plot <- cowplot::plot_grid(plotlist = list(pl5, pl3, pl4), ncol = 1)

# Show the combined plot
print(combined_plot)

# Save plot as PDF (cairo device) and as PNG (white background)
ggsave(
  filename = "../img_gpt_4o/accuracy_groups_vs_experts_with_types_refined_and_baseline_facets_3_classes.pdf",
  plot = combined_plot,
  device = cairo_pdf,
  width = 12,
  height = 16,
  dpi = 300
)
ggsave(
  filename = "../img_gpt_4o/accuracy_groups_vs_experts_with_types_refined_and_baseline_facets_3_classes.png",
  plot = combined_plot,
  width = 12,
  height = 16,
  dpi = 300,
  bg = "White"
)
################################################################################
# Confusion Table all Finetuning Models and ZeroShot GPT
################################################################################
# Function to extract and flatten a 3x3 confusion matrix with meaningful labels
#
# Arguments:
#   model  evaluation result whose "Confusion Matrix" element holds, as its
#          first entry, a table with the columns Prediction, Target and N.
#          Classes may be coded "0"/"1"/"2" or "Neither"/"Hate"/"Toxic".
#
# Returns a named vector of length 9 in row-major order (rows = predicted
# class 0, 1, 2; columns = target class 0, 1, 2). Combinations that do not
# occur in the table are 0, so the result always has all nine cells, even
# when a class never appears.
extract_flat_cm <- function(model) {
  cm <- model[["Confusion Matrix"]][[1]]

  # Map text labels to the numeric codes; other values are left untouched
  label_to_code <- c(Neither = "0", Hate = "1", Toxic = "2")
  to_code <- function(x) {
    x <- as.character(x)
    known <- x %in% names(label_to_code)
    x[known] <- unname(label_to_code[x[known]])
    x
  }
  pred <- to_code(cm$Prediction)
  target <- to_code(cm$Target)

  classes <- c("0", "1", "2")
  unknown <- setdiff(c(pred, target), classes)
  if (length(unknown) > 0) {
    stop("Unexpected class label(s) in confusion matrix: ",
         paste(unknown, collapse = ", "), call. = FALSE)
  }

  # Fill a fixed 3x3 grid (rows = Prediction, columns = Target) so that
  # classes missing from the table still get their (zero) cells
  mat <- matrix(0L, nrow = 3, ncol = 3, dimnames = list(classes, classes))
  mat[cbind(pred, target)] <- cm$N

  # Flatten the matrix in row-major order by transposing first
  flat <- as.vector(t(mat))

  # Create descriptive names for each cell:
  # Diagonals: TP (class); off-diagonals: FP (predicted)/FN (actual)
  names(flat) <- c("TP (0)", "FP (0)/FN (1)", "FP (0)/FN (2)",
                   "FP (1)/FN (0)", "TP (1)", "FP (1)/FN (2)",
                   "FP (2)/FN (0)", "FP (2)/FN (1)", "TP (2)")
  flat
}

# Pool the fine-tuned and the non-fine-tuned model lists into one named list
all_conf_mats <- c(conf_mat_with_fine_tuning, conf_mat_no_fine_tuning)

# Build one single-row data frame per model: the model name followed by its
# flattened confusion-matrix cells. check.names = FALSE keeps the cell labels
# (which contain spaces and parentheses) as column names.
confusion_data <- lapply(names(all_conf_mats), function(nm) {
  cells <- t(extract_flat_cm(all_conf_mats[[nm]]))
  data.frame(Name = nm, cells, check.names = FALSE)
})

# Stack the per-model rows into one table
confusion_df <- do.call(rbind, confusion_data)

# Models that are left out of the table
models_to_exclude <- c(
  "Experts 100", "Experts 250", "Research Assistants",
  "NGO", "Citizen Science", "Prolific", "Appen",
  "chatGPT 100", "chatGPT 250"
)

# Drop the excluded models, then move "ChatGPT" (if present) to the top while
# keeping the remaining models in their current order
confusion_df <- confusion_df %>%
  filter(!Name %in% models_to_exclude) %>%
  arrange(match(Name, c("ChatGPT", setdiff(Name, "ChatGPT"))))

# Build the LaTeX table with xtable
latex_table <- xtable(
  confusion_df,
  caption = "Flattened 3×3 Confusion Matrices for All Models with Meaningful Labels",
  label = "tab:confusion_matrices",
  auto = FALSE
)

# Print as booktabs LaTeX with the caption on top, no row names, and cell text
# passed through unescaped
print(
  latex_table,
  type = "latex",
  booktabs = TRUE,
  caption.placement = "top",
  include.rownames = FALSE,
  sanitize.text.function = function(x){x}
)


################################################################################
# Agreement rates between prediction sets (zero-shot vs. fine-tuned; *_100 vs. *_250)
################################################################################


# Agreement (in percent) between the GPT zero-shot labels and the predictions
# stored in each *_df_250 / *_df_100 dataset.
# NOTE(review): the comparison is positional, i.e. it presumes that row k of
# both datasets refers to the same item - confirm the datasets share row order.

# Vector with names of the first dataset in each pair
datasets_group1 <- rep("chat_wide_m",  times = 10)
datasets_group1name <- rep("GPT Zero-Shot",  times = 10)
# Vector with names of the second dataset in each pair
datasets_group2 <- c("ngo_df_250", "appen_df_250", "citi_df_250", "prolific_df_250", "ras_df_250", "ngo_df_100", "appen_df_100", "citi_df_100", "prolific_df_100", "ras_df_100")

# Every first dataset needs exactly one partner
stopifnot(length(datasets_group1) == length(datasets_group2))

# Initialize a vector to store agreement rates
agreement_rates <- numeric(length(datasets_group1))
# Loop through the dataset names (seq_along() is safe even for an empty vector)
for (i in seq_along(datasets_group1)) {
  # Dynamically access datasets using get() and derive a three-class label
  # from the two binary columns.
  # NOTE(review): "TOXIC" is assigned only when ishate_combined == 1 AND
  # istoxic_combined == 1; rows with ishate_combined == 0 always become
  # "NEITHER" - confirm this matches the intended coding scheme.
  dataset1 <- get(datasets_group1[i]) %>% mutate(ishatetoxic_combined = ifelse(ishate_combined  == 1 & istoxic_combined == 0, "HATE" ,
                                                                               ifelse(ishate_combined == 1 & istoxic_combined == 1, "TOXIC", "NEITHER")))
  dataset2 <- get(datasets_group2[i])

  # `==` recycles the shorter vector, so datasets of different size would
  # yield a meaningless rate instead of an error
  stopifnot(nrow(dataset1) == nrow(dataset2))

  # Calculate the agreement for the current pair of datasets
  agreement <- dataset1$ishatetoxic_combined == dataset2$prediction

  # Calculate and store the agreement rate (percent) for the current pair
  agreement_rates[i] <- mean(agreement) * 100
}

# Print the agreement rates for each pair
print(agreement_rates)

# Name the elements of agreement_rates for clarity and print again
names(agreement_rates) <- paste(datasets_group1name, datasets_group2, sep=" vs ")
print(agreement_rates)



# Agreement (in percent) between the predictions stored in each *_df_100
# dataset and those in the matching *_df_250 dataset.
# NOTE(review): the comparison is positional, i.e. it presumes that row k of
# both datasets refers to the same item - confirm the datasets share row order.

# Vector with names of the first dataset in each pair
datasets_group1 <- c("ngo_df_100", "appen_df_100", "citi_df_100", "prolific_df_100", "ras_df_100")
datasets_group1name <- c("ngo_df_100", "appen_df_100", "citi_df_100", "prolific_df_100", "ras_df_100")
# Vector with names of the second dataset in each pair
datasets_group2 <- c("ngo_df_250", "appen_df_250", "citi_df_250", "prolific_df_250", "ras_df_250")

# Every first dataset needs exactly one partner
stopifnot(length(datasets_group1) == length(datasets_group2))

# Initialize a vector to store agreement rates
agreement_rates <- numeric(length(datasets_group1))
# Loop through the dataset names (seq_along() is safe even for an empty vector)
for (i in seq_along(datasets_group1)) {
  # Dynamically access datasets using get()
  dataset1 <- get(datasets_group1[i])
  dataset2 <- get(datasets_group2[i])

  # `==` recycles the shorter vector, so datasets of different size would
  # yield a meaningless rate instead of an error
  stopifnot(nrow(dataset1) == nrow(dataset2))

  # Calculate the agreement for the current pair of datasets
  agreement <- dataset1$prediction == dataset2$prediction

  # Calculate and store the agreement rate (percent) for the current pair
  agreement_rates[i] <- mean(agreement) * 100
}

# Print the agreement rates for each pair
print(agreement_rates)

# Name the elements of agreement_rates for clarity and print again
names(agreement_rates) <- paste(datasets_group1name, datasets_group2, sep=" vs ")
print(agreement_rates)

